Because neither I nor our lab members currently have data in hand, I downloaded the COVID-19 data from the WHO (https://covid19.who.int/table). This project focuses on the COVID-19 data of different countries from January to November in order to explore the pandemic situation. Many online sources have already analyzed the COVID-19 data and produced good visualizations; however, this project is good practice for me in manipulating, cleaning, and mutating data. In this project, line plots, bar plots, facets, heatmaps, and animations were used.
##install packages and load in libraries
library(ggplot2)
##install.packages("lubridate")
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggpubr)
library(viridis)
## Loading required package: viridisLite
library(gganimate)
## Show the working directory; the relative CSV path below is resolved against it.
getwd()
## [1] "/Users/lijiaxuan/Downloads"
## Read the WHO COVID-19 table into a data frame.
data <- read.csv(file = "WHO-COVID-19-global-data.csv")
## Inspect the structure: column names, column types and the first values.
str(object = data)
## 'data.frame': 74495 obs. of 8 variables:
## $ Date_reported : chr "2020-01-03" "2020-01-04" "2020-01-05" "2020-01-06" ...
## $ Country_code : chr "AF" "AF" "AF" "AF" ...
## $ Country : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ WHO_region : chr "EMRO" "EMRO" "EMRO" "EMRO" ...
## $ New_cases : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Cumulative_cases : int 0 0 0 0 0 0 0 0 0 0 ...
## $ New_deaths : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Cumulative_deaths: int 0 0 0 0 0 0 0 0 0 0 ...
## Column-wise preview of the data (glimpse() comes from dplyr).
dplyr::glimpse(data)
## Rows: 74,495
## Columns: 8
## $ Date_reported <chr> "2020-01-03", "2020-01-04", "2020-01-05", "2020-01-…
## $ Country_code <chr> "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF…
## $ Country <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afgha…
## $ WHO_region <chr> "EMRO", "EMRO", "EMRO", "EMRO", "EMRO", "EMRO", "EM…
## $ New_cases <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Cumulative_cases <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ New_deaths <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Cumulative_deaths <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## Print the first six rows of the data.
head(data, n = 6L)
## Date_reported Country_code Country WHO_region New_cases Cumulative_cases
## 1 2020-01-03 AF Afghanistan EMRO 0 0
## 2 2020-01-04 AF Afghanistan EMRO 0 0
## 3 2020-01-05 AF Afghanistan EMRO 0 0
## 4 2020-01-06 AF Afghanistan EMRO 0 0
## 5 2020-01-07 AF Afghanistan EMRO 0 0
## 6 2020-01-08 AF Afghanistan EMRO 0 0
## New_deaths Cumulative_deaths
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## Per-column summary of the data.
summary(object = data)
## Date_reported Country_code Country WHO_region
## Length:74495 Length:74495 Length:74495 Length:74495
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## New_cases Cumulative_cases New_deaths Cumulative_deaths
## Min. :-32952.0 Min. : 0 Min. :-514.00 Min. : 0
## 1st Qu.: 0.0 1st Qu.: 0 1st Qu.: 0.00 1st Qu.: 0
## Median : 1.0 Median : 330 Median : 0.00 Median : 6
## Mean : 713.7 Mean : 57929 Mean : 17.46 Mean : 1985
## 3rd Qu.: 89.0 3rd Qu.: 7584 3rd Qu.: 1.00 3rd Qu.: 133
## Max. :193734.0 Max. :10460365 Max. :6409.00 Max. :241186
## Country is read in as character ...
class(data[["Country"]])
## [1] "character"
## ... so convert it to a factor.
data[["Country"]] <- as.factor(data[["Country"]])
## The date and country-code columns are character as well.
class(data[["Date_reported"]])
## [1] "character"
class(data[["Country_code"]])
## [1] "character"
Change the date settings: plotting a label for every single day makes the axis too crowded, so the dates are converted to the Date class and monthly breaks are defined here, which will be used later.
##install the "lubridate" package (https://rawgit.com/rstudio/cheatsheets/master/lubridate.pdf), which is specifically designed for manipulating date-related data
##install.packages("lubridate")
library(lubridate)
## Convert the Date_reported column from "character" to the "Date" class,
## printing the class before and after the conversion.
class(data[["Date_reported"]])
## [1] "character"
data[["Date_reported"]] <- as.Date(data[["Date_reported"]])
class(data[["Date_reported"]])
## [1] "Date"
## Monthly tick positions for the date axes of the plots further down:
## the first break is 2020-01-03, then one per month up to 2020-11-14.
date_breaks <- seq(
  from = as.Date("2020-01-03"),
  to = as.Date("2020-11-14"),
  by = "1 month"
)
## filter() comes from the dplyr package, which is loaded at the top.
## Keep only the rows reported for China (country code "CN").
data_china <- filter(data, Country_code == "CN")
## Number of rows and columns of the subset.
dim(data_china)
## [1] 317 8
## First six rows of the subset.
head(data_china, n = 6L)
## Date_reported Country_code Country WHO_region New_cases Cumulative_cases
## 1 2020-01-03 CN China WPRO 0 0
## 2 2020-01-04 CN China WPRO 1 1
## 3 2020-01-05 CN China WPRO 0 1
## 4 2020-01-06 CN China WPRO 3 4
## 5 2020-01-07 CN China WPRO 0 4
## 6 2020-01-08 CN China WPRO 0 4
## New_deaths Cumulative_deaths
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## First line plot: cumulative cases in China over the period covered by the
## data (January to November 2020).
ggplot(
  data = data_china,
  mapping = aes(x = Date_reported, y = Cumulative_cases)
) +
  labs(title = "Cumulative_cases in China") +
  geom_line()
## Build one China time-series plot: small coloured points with a line drawn
## on top, monthly x-axis breaks and vertical tick labels.
##   y_col        name of the data_china column shown on the y axis
##   plot_title   title of the plot
##   point_colour colour of the points
## The four plots below differ only in these three values, so they share
## this helper instead of repeating the same ggplot code four times.
plot_china_series <- function(y_col, plot_title, point_colour) {
  ggplot(data_china, aes(x = Date_reported, y = .data[[y_col]])) +
    geom_point(color = point_colour, size = 0.5) +
    geom_line() +
    scale_x_date(breaks = date_breaks) +
    ## Set the y label explicitly so it is the plain column name.
    labs(title = plot_title, y = y_col) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.2, hjust = 0.2))
}

## Cumulative cases in China from January to November.
gp_china_1 <- plot_china_series(
  "Cumulative_cases", "Cumulative_cases in China", "lightgreen"
)
gp_china_1

## New cases in China from January to November.
gp_china_2 <- plot_china_series(
  "New_cases", "New_cases in China", "lightblue"
)
gp_china_2

## New deaths in China from January to November.
gp_china_3 <- plot_china_series(
  "New_deaths", "New deaths in China", "lightblue"
)
gp_china_3

## Cumulative deaths in China from January to November.
gp_china_4 <- plot_china_series(
  "Cumulative_deaths", "Cumulative_deaths in China", "lightgreen"
)
gp_china_4
## Draw one red line plot per case/death metric for China.
## The metrics are addressed by column name instead of by position (5:8), so
## the loop does not depend on the column order, and no variable named
## `colnames` (which shadowed base::colnames) is left behind.
cases_cn <- c("New_cases", "Cumulative_cases", "New_deaths", "Cumulative_deaths")
cases_cn
## [1] "New_cases" "Cumulative_cases" "New_deaths"
## [4] "Cumulative_deaths"
for (metric in cases_cn) {
  ## print() is required: plots are not auto-printed inside a for loop.
  print(
    ggplot(data_china, aes(x = Date_reported, y = .data[[metric]])) +
      geom_line(col = "red") +
      theme_classic() +
      theme(legend.position = "none",
            plot.title = element_text(hjust = 0.5)) +
      ylab(metric) ## name the y axis after the plotted column
  )
}
## Arrange the four China plots in a 2 x 2 grid
## (https://www.datanovia.com/en/lessons/combine-multiple-ggplots-into-a-figure/).
library(ggpubr)
## ggarrange() returns the composed figure. The `+ geom_jitter()` that used to
## follow it had no data or aesthetics to draw, so it has been dropped.
ggarrange(gp_china_1, gp_china_2, gp_china_3, gp_china_4, ncol = 2, nrow = 2)
## Heatmap of the cumulative cases in China: the country on the x axis, the
## reporting date on the y axis, fill colour = cumulative case count.
ggplot(
  data_china,
  aes(x = Country, y = Date_reported, fill = Cumulative_cases)
) +
  geom_tile() +
  scale_y_date(breaks = date_breaks) +
  scale_fill_viridis_c()
### Bar plots for the data in China.
## Bars of the cumulative cases, overlaid with one line for the cumulative
## cases and one line for the daily new cases.
bp_china1 <- ggplot(data_china, aes(x = Date_reported, y = Cumulative_cases)) +
  geom_col(color = "lightblue") +
  labs(title = "Cumulative_cases in China") +
  geom_line(aes(color = "Cumulative_cases")) +
  geom_line(aes(y = New_cases, color = "New_cases")) +
  ## The constant strings mapped to colour above create the legend entries.
  ## The legend is titled "cases" because this plot shows case counts; it
  ## previously read "COVID-19 deaths".
  scale_color_manual(
    name = "COVID-19 cases",
    values = c("Cumulative_cases" = "yellow", "New_cases" = "red")
  )
bp_china1
## Bars of the cumulative deaths in China, overlaid with one line for the
## cumulative deaths and one line for the daily new deaths.
bp_china2 <- ggplot(data_china, aes(x = Date_reported, y = Cumulative_deaths)) +
  geom_col(color = "lightblue") +
  labs(title = "Cumulative_deaths in China") +
  geom_line(aes(color = "Cumulative_deaths")) +
  geom_line(aes(y = New_deaths, color = "New_deaths")) +
  ## The constant strings mapped to colour above create the legend entries.
  scale_color_manual(
    name = "COVID-19 deaths",
    values = c("Cumulative_deaths" = "yellow", "New_deaths" = "red")
  )
bp_china2
## Show the two China bar plots side by side (one row, two columns).
ggarrange(bp_china1, bp_china2, nrow = 1, ncol = 2)
## Keep only the rows reported for the US (country code "US").
data_US <- filter(data, Country_code == "US")
## Simple red line plot of the cumulative cases in the US, with monthly
## x-axis breaks, a centred title and tick labels rotated by 45 degrees.
ggplot(data_US, aes(x = Date_reported, y = Cumulative_cases)) +
  geom_line(col = "red") +
  scale_x_date(breaks = date_breaks) +
  labs(title = "Cumulative_cases in US") +
  theme_classic() +
  ## Tweaks go after theme_classic() so the complete theme does not reset them.
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45, vjust = 0, hjust = 0)
  )
## Draw one red line plot per case/death metric for the US.
## The metrics are addressed by column name instead of by position (5:8), so
## the loop does not depend on the column order, and no variable named
## `colnames` (which shadowed base::colnames) is left behind.
cases <- c("New_cases", "Cumulative_cases", "New_deaths", "Cumulative_deaths")
cases
## [1] "New_cases" "Cumulative_cases" "New_deaths"
## [4] "Cumulative_deaths"
for (metric in cases) {
  ## print() is required: plots are not auto-printed inside a for loop.
  print(
    ggplot(data_US, aes(x = Date_reported, y = .data[[metric]])) +
      geom_line(col = "red") +
      theme_classic() +
      theme(legend.position = "none",
            plot.title = element_text(hjust = 0.5)) +
      ylab(metric) ## name the y axis after the plotted column
  )
}
## Simple bar plot of the cumulative cases in the US (light-blue bar
## outlines), with monthly x-axis breaks and tick labels rotated by 45 degrees.
ggplot(data_US, aes(x = Date_reported, y = Cumulative_cases)) +
  geom_col(color = "lightblue") +
  scale_x_date(breaks = date_breaks) +
  labs(title = "Cumulative_cases in US") +
  theme_classic() +
  ## Tweaks go after theme_classic() so the complete theme does not reset them.
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45, vjust = 0, hjust = 0)
  )
## Draw one bar plot per case/death metric for the US.
## The metrics are addressed by column name instead of by position (5:8), so
## the loop does not depend on the column order, and no variable named
## `colnames` (which shadowed base::colnames) is left behind.
cases <- c("New_cases", "Cumulative_cases", "New_deaths", "Cumulative_deaths")
cases
## [1] "New_cases" "Cumulative_cases" "New_deaths"
## [4] "Cumulative_deaths"
for (metric in cases) {
  ## print() is required: plots are not auto-printed inside a for loop.
  print(
    ggplot(data_US, aes(x = Date_reported, y = .data[[metric]])) +
      geom_col(color = "lightblue") +
      theme_classic() +
      theme(legend.position = "none",
            plot.title = element_text(hjust = 0.5)) +
      ylab(metric) ## name the y axis after the plotted column
  )
}
## 4.3 Make a heatmap
## Heatmap of the cumulative cases in the US: the country on the x axis, the
## reporting date on the y axis, fill colour = cumulative case count.
ggplot(
  data_US,
  aes(x = Country, y = Date_reported, fill = Cumulative_cases)
) +
  geom_tile() +
  scale_y_date(breaks = date_breaks) +
  scale_fill_viridis_c()
## Heatmap of the cumulative cases for every country in the original data
## set: countries on the x axis, reporting dates on the y axis.
library(viridis)
ggplot(
  data = data,
  mapping = aes(x = Country, y = Date_reported, fill = Cumulative_cases)
) +
  geom_tile()
library(dplyr)
## The x axis is too crowded with every country, so restrict the data to five
## countries, selected by their country codes.
## `%in%` tests every row against the whole set of codes. The earlier
## `Country_code == c(...)` recycled the five codes along the rows, so each
## row was compared with only one of them and most matching rows were
## silently dropped (the output recorded below, with dates five days apart,
## shows that symptom).
selected_codes <- c("CN", "US", "GB", "FR", "CA")
selected_country <- filter(data, Country_code %in% selected_codes)
head(selected_country)
## Date_reported Country_code Country WHO_region New_cases Cumulative_cases
## 1 2020-01-03 CA Canada AMRO 0 0
## 2 2020-01-08 CA Canada AMRO 0 0
## 3 2020-01-13 CA Canada AMRO 0 0
## 4 2020-01-18 CA Canada AMRO 0 0
## 5 2020-01-23 CA Canada AMRO 0 0
## 6 2020-01-28 CA Canada AMRO 1 7
## New_deaths Cumulative_deaths
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
### Heatmap of the cumulative cases in the five selected countries:
### countries on the x axis (vertical labels), reporting dates on the y axis.
ggplot(
  selected_country,
  aes(x = Country, y = Date_reported, fill = Cumulative_cases)
) +
  geom_tile() +
  scale_y_date(breaks = date_breaks) +
  scale_fill_viridis_c() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.2, hjust = 0.2))
## Facets: cumulative deaths over time, one panel per selected country.
ggplot(selected_country, aes(x = Date_reported, y = Cumulative_deaths)) +
  ## The fixed colour is set on the layer. Inside aes(), "blue" was treated
  ## as a data value and mapped through the default colour scale, so the
  ## lines were not drawn in blue.
  geom_line(color = "blue") +
  theme_classic() +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5)) +
  facet_wrap(~Country, nrow = 2)
library(gganimate)
## Base scatter plot for the animations: new cases against cumulative cases
## for the selected countries, one viridis colour per country, legend hidden.
Animation <- ggplot(
  data = selected_country,
  mapping = aes(x = New_cases, y = Cumulative_cases, color = Country)
) +
  geom_point(size = 2.0, alpha = 0.7, show.legend = FALSE) +
  scale_color_viridis_d() +
  ## NOTE(review): no aesthetic is mapped to size in this plot, so this scale
  ## looks like it has no visible effect - confirm before removing.
  scale_size(range = c(1, 5))
## Display the static version of the plot.
Animation
## Animate the scatter plot along the reporting date; the title shows the
## time of the current frame.
Animation +
  labs(title = "Date:{frame_time}") +
  transition_time(time = Date_reported)
##+ scale_x_date(breaks = date_breaks)
## Same animation with one panel per country. Points of past frames stay
## visible as small marks, and the view follows the data with the y range
## kept fixed.
Animation +
  facet_wrap(facets = ~Country) +
  labs(title = "Date:{frame_time}") +
  transition_time(time = Date_reported) +
  shadow_mark(alpha = 1, size = 0.5) +
  view_follow(fixed_y = TRUE)